> project_summary = "/Users/rory/cache/yimlamai-hepatocytes/2016-08-29_yimlamai/project-summary.csv"
> counts_file = "/Users/rory/cache/yimlamai-hepatocytes/2016-08-29_yimlamai/combined.counts"
> tx2genes_file = "/Users/rory/cache/yimlamai-hepatocytes/2016-08-29_yimlamai/tx2gene.csv"
> cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
+ "#D55E00", "#CC79A7")
> summarydata = read.table(project_summary, header = TRUE, sep = ",")
> summarydata = summarydata[, colSums(is.na(summarydata)) < nrow(summarydata)]
> # handle newer bcbio-nextgen runs that use description as the key
> if ("description" %in% colnames(summarydata)) {
+ rownames(summarydata) = summarydata$description
+ summarydata$Name = rownames(summarydata)
+ summarydata$description = NULL
+ } else {
+ rownames(summarydata) = summarydata$Name
+ # summarydata$Name = NULL
+ }
> summarydata = summarydata[order(rownames(summarydata)), ]
> if (file.exists(tx2genes_file)) {
+ sample_dirs = file.path(dirname(project_summary), "..", rownames(summarydata))
+ salmon_files = file.path(sample_dirs, "salmon", "quant.sf")
+ sailfish_files = file.path(sample_dirs, "sailfish", "quant.sf")
+ new_sailfish = file.path(sample_dirs, "sailfish", "quant", "quant.sf")
+ new_salmon = file.path(sample_dirs, "salmon", "quant", "quant.sf")
+ if (file.exists(salmon_files[1])) {
+ sf_files = salmon_files
+ } else if (file.exists(sailfish_files[1])) {
+ sf_files = sailfish_files
+ } else if (file.exists(new_sailfish[1])) {
+ sf_files = new_sailfish
+ } else if (file.exists(new_salmon[1])) {
+ sf_files = new_salmon
+ }
+ names(sf_files) = rownames(summarydata)
+ tx2gene = read.table(tx2genes_file, sep = ",", row.names = NULL, header = FALSE)
+ txi.salmon = tximport(sf_files, type = "salmon", tx2gene = tx2gene, reader = readr::read_tsv,
+ countsFromAbundance = "lengthScaledTPM")
+ counts = round(data.frame(txi.salmon$counts, check.names = FALSE))
+ } else {
+ counts = read.table(counts_file, header = TRUE, row.names = "id", check.names = FALSE)
+ }
> counts = counts[, order(colnames(counts)), drop = FALSE]
> colnames(counts) = gsub(".counts", "", colnames(counts))
>
> # this is a list of all non user-supplied metadata columns that could appear
> known_columns = c("Name", "X.GC", "Exonic.Rate", "Sequences.flagged.as.poor.quality",
+ "rRNA_rate", "Fragment.Length.Mean", "Intronic.Rate", "Intergenic.Rate",
+ "Mapping.Rate", "Quality.format", "Duplication.Rate.of.Mapped", "Mapped",
+ "rRNA", "Sequence.length", "Transcripts.Detected", "Mean.Per.Base.Cov.",
+ "Genes.Detected", "Unique.Starts.Per.Read", "unique_starts_per_read", "complexity",
+ "X5.3.bias", "Duplicates.pct", "Duplicates", "Mapped.reads", "Average.insert.size",
+ "Mapped.reads.pct", "Total.reads", "avg_coverage_per_region", "Mapped.Reads")
> summarydata[, "Fragment.Length.Mean"] = summarydata$Average.insert.size
> summarydata$gt = paste(summarydata$genotype, summarydata$treatment, sep = "_")
> metadata = summarydata[, !colnames(summarydata) %in% known_columns, drop = FALSE]
> metadata = metadata[, colSums(is.na(metadata)) < nrow(metadata), drop = FALSE]
> sanitize_datatable = function(df, ...) {
+ # remove dashes which cause wrapping
+ DT::datatable(df, ..., rownames = gsub("-", "_", rownames(df)), colnames = gsub("-",
+ "_", colnames(df)))
+ }
> # set seed for reproducibility
> set.seed(1454944673)
> get_heatmap_fn = function(summarydata) {
+ # return the pheatmap function with or without metadata
+ if (ncol(metadata) == 0) {
+ return(pheatmap)
+ } else {
+ # rownames(metadata) = summarydata$Name
+ heatmap_fn = function(data, ...) {
+ pheatmap(data, annotation = metadata, clustering_method = "ward.D2",
+ clustering_distance_cols = "correlation", ...)
+ }
+ return(heatmap_fn)
+ }
+ }
> heatmap_fn = get_heatmap_fn(summarydata)
> qualimap_run = "Mapped" %in% colnames(summarydata)
> do_quality = "Mapped.reads" %in% colnames(summarydata)
Mapped reads looks great, there is some variation but that is to be expected.
> ggplot(summarydata, aes(x = Name, y = Mapped)) + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")
> ggplot(summarydata, aes(x = Name, y = Mapped.reads)) + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")
The genomic mapping rate looks excellent for all of the samples.
> ggplot(summarydata, aes(x = Name, y = Mapping.Rate)) + geom_bar(stat = "identity") +
+ ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90))
> ggplot(summarydata, aes(x = Name, y = Mapped.reads.pct)) + geom_bar(stat = "identity") +
+ ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90))
We see around the same number of genes detected in each sample, which is another good sign.
> dd = data.frame(Name = colnames(counts), Genes.Detected = colSums(counts > 0))
> ggplot(dd, aes(x = Name, y = Genes.Detected)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("genes detected") +
+ xlab("")
The exonic mapping rate is a measurement of how much RNA we actually sequenced; if the genomic mapping rate is super high and the exonic mapping rate is low, we have DNA contamination. There are a couple samples that look like they might have some DNA contamination, but nothing absolutely horrible.
> ggplot(summarydata, aes(x = Name, y = Exonic.Rate)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("exonic mapping rate") +
+ xlab("")
These rates are low for all samples, which is good.
> eval_rRNA = "rRNA_rate" %in% colnames(summarydata) & !sum(is.na(summarydata$rRNA_rate)) ==
+ nrow(summarydata)
> ggplot(summarydata, aes(x = Name, y = rRNA_rate)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("rRNA rate") +
+ xlab("")
This is round 1, which is good– if it deviates much from 1 then it is indicative of degradation of the RNA.
> ggplot(summarydata, aes(x = Name, y = X5.3.bias)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("5'->3' bias") +
+ xlab("")
The distributions of counts looks pretty similar across all of the samples.
> melted = melt(counts)
> colnames(melted) = c("sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
Normalizing makes them much more similar, which is good. Trimmed mean of M-values (TMM) normalization is described here
Robinson, M. D., & Oshlack, A. (2010). A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biology, 11(3). doi:10.1186/gb-2010-11-3-r25
> y = DGEList(counts = counts)
> y = calcNormFactors(y)
> normalized_counts = cpm(y, normalized.lib.sizes = TRUE)
> melted = melt(normalized_counts)
> colnames(melted) = c("gene", "sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
Samples of the same genotype and treatment cluster together by Spearman correlation which is a good sign.
> heatmap_fn(cor(normalized_counts, method = "pearson"))
> heatmap_fn(cor(normalized_counts, method = "spearman"))
Samples separate very cleanly along the 1st and 2nd principal components by genotype and treatment. This is good news for differential expression analyses.
> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = ~Name)
> vst = varianceStabilizingTransformation(dds)
> pca_loadings = function(object, ntop = 500) {
+ rv <- matrixStats::rowVars(assay(object))
+ select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
+ pca <- prcomp(t(assay(object)[select, ]))
+ percentVar <- pca$sdev^2/sum(pca$sdev^2)
+ names(percentVar) = colnames(pca$x)
+ pca$percentVar = percentVar
+ return(pca)
+ }
> pc = pca_loadings(vst)
> comps = data.frame(pc$x)
> comps$Name = rownames(comps)
> library(dplyr)
> comps = comps %>% left_join(summarydata, by = c(Name = "Name"))
> colorby = "Name"
> pca_plot = function(comps, nc1, nc2, colorby) {
+ c1str = paste0("PC", nc1)
+ c2str = paste0("PC", nc2)
+ if (!(c1str %in% colnames(comps) && c2str %in% colnames(comps))) {
+ warning("Higher order components not found, skipping plotting.")
+ return(NA)
+ }
+ ggplot(comps, aes_string(c1str, c2str, color = colorby)) + geom_point() +
+ theme_bw() + xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100),
+ "% variance")) + ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] *
+ 100), "% variance"))
+ }
> pca_plot(comps, 1, 2, "gt")
> pca_plot(comps, 3, 4, "gt")
> pca_plot(comps, 5, 6, "gt")
> ggplot(data.frame(component = reorder(names(pc$percentVar), -pc$percentVar),
+ percent_var = pc$percentVar), aes(component, percent_var)) + geom_bar(stat = "identity") +
+ ylab("percent of total variation") + xlab("") + theme_bw()
> # snagged from development version of DESeq
> DESeqDataSetFromTximport <- function(txi, colData, design, ...) {
+ counts <- round(txi$counts)
+ mode(counts) <- "integer"
+ dds <- DESeqDataSetFromMatrix(countData = counts, colData = colData, design = design,
+ ...)
+ stopifnot(txi$countsFromAbundance %in% c("no", "scaledTPM", "lengthScaledTPM"))
+ if (txi$countsFromAbundance %in% c("scaledTPM", "lengthScaledTPM")) {
+ message("using length scaled TPM counts from tximport")
+ } else {
+ message("using counts and average transcript lengths from tximport")
+ lengths <- txi$length
+ dimnames(lengths) <- dimnames(dds)
+ assays(dds)[["avgTxLength"]] <- lengths
+ }
+ return(dds)
+ }
>
> subset_tximport = function(txi, rows, columns) {
+ txi$counts = txi$counts[rows, columns]
+ txi$abundance = txi$abundance[rows, columns]
+ txi$length = txi$length[rows, columns]
+ return(txi)
+ }
> deseq2resids = function(dds) {
+ # calculate residuals for a deseq2 fit
+ fitted = t(t(assays(dds)[["mu"]])/sizeFactors(dds))
+ return(counts(dds, normalized = TRUE) - fitted)
+ }
> library(DEGreport)
> library(vsn)
> design = ~genotype + treatment + genotype:treatment
> summarydata$treatment = relevel(summarydata$treatment, "untreated")
> summarydata$genotype = relevel(summarydata$genotype, "wt")
Here we fit a model that looks like ~genotype + treatment + genotype:treatment. The coefficient of genotype will be the effect due to it being a knockout, when there has been no treatment. The coefficient of treatment will be the effect of effect of treatment on the control cells. The coefficient of genotype:treatment will be the effect of treatment on the knockout cells.
> counts <- counts[rowSums(counts > 0) > 1, ]
> if (exists("txi.salmon")) {
+ txi.salmon = subset_tximport(txi.salmon, rownames(counts), colnames(counts))
+ dds = DESeqDataSetFromTximport(txi.salmon, colData = summarydata, design = design)
+ } else {
+ dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata,
+ design = design)
+ }
> geoMeans = apply(counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row !=
+ 0]))))
> dds = estimateSizeFactors(dds, geoMeans = geoMeans)
> dds = DESeq(dds)
These dispersion estimates look normal– the red line is the fitted value for the mean-variance relationship, the black dots are the gene-wise dispersion values and the blue dots are the gene-wise dispersions shrunk back to the fitted line.
> plotDispEsts(dds)
> library(biomaRt)
> biomart_dataset = "mmusculus_gene_ensembl"
> mart = biomaRt::useMart(biomart = "ensembl", dataset = biomart_dataset)
> symbols = biomaRt::getBM(attributes = c("ensembl_gene_id", "mgi_symbol"), mart = mart)
> entrez = biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene"), mart = mart)
> entrez$entrezgene = as.character(entrez$entrezgene)
>
> plotMA_full = function(res) {
+ ymax = max(res$log2FoldChange, na.rm = TRUE)
+ ymin = min(res$log2FoldChange, na.rm = TRUE)
+ plotMA(res, ylim = c(ymin, ymax))
+ }
> volcano = function(res) {
+ stats = as.data.frame(res[, c(2, 6)])
+ p = volcano_density_plot(stats, lfc.cutoff = 1.5)
+ print(p)
+ }
> markup_deseq = function(res, fname) {
+ out_df = as.data.frame(res)
+ out_df$id = rownames(out_df)
+ out_df = out_df[, c("id", colnames(out_df)[colnames(out_df) != "id"])]
+ out_df = out_df %>% left_join(symbols, by = c(id = "ensembl_gene_id")) %>%
+ arrange(padj)
+ return(out_df)
+ }
> write_deseq = function(res, fname) {
+ out_df = as.data.frame(res)
+ out_df$id = rownames(out_df)
+ out_df = out_df[, c("id", colnames(out_df)[colnames(out_df) != "id"])]
+ out_df = out_df %>% left_join(symbols, by = c(id = "ensembl_gene_id")) %>%
+ arrange(padj)
+ write.table(out_df, file = fname, quote = FALSE, sep = ",", row.names = FALSE,
+ col.names = TRUE)
+ }
> of_interest = c("Tgfb1", "Tgfb2", "Pdgfc", "Il6")
> ko = results(dds, name = "genotype_yaptaz_vs_wt")
> komarked = markup_deseq(ko)
> plotMA_full(ko)
> volcano(ko)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1119]
> write_deseq(ko, "knockout-effect.csv")
There are 308 genes affected by the knockout.
> treatwt = results(dds, name = "treatment_ccl4_vs_untreated")
> treatwtmarked = markup_deseq(treatwt)
> plotMA_full(treatwt)
> volcano(treatwt)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1286]
> write_deseq(treatwt, "treatment-effect-on-wildtype.csv")
There are 513 genes affected by the CCL4 treatment in the wildtype.
> treatko = results(dds, list(c("treatment_ccl4_vs_untreated", "genotypeyaptaz.treatmentccl4")))
> treatkomarked = markup_deseq(treatko)
> plotMA_full(treatko)
> volcano(treatko)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1453]
> write_deseq(treatko, "treatment-effect-on-knockout.csv")
There are 3176 genes affected by the CCL4 treatment in the knockout.
> treatkospecific = results(dds, name = "genotypeyaptaz.treatmentccl4")
> plotMA_full(treatkospecific)
> volcano(treatkospecific)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1620]
> treatkospecificmarked = markup_deseq(treatkospecific)
> write_deseq(treatkospecific, "specific-effect-on-knockout.csv")
There are 796 genes specifically affected differently by the CCL4 treatment in the knockout compared to the wildtype.
These tables have the format:
id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,mgi_symbol
> orgdb = "org.Mm.eg.db"
> biomart_dataset = "mmusculus_gene_ensembl"
> keggname = "mmu"
> library(dplyr)
> library(clusterProfiler)
> library(orgdb, character.only = TRUE)
> library(biomaRt)
> mart = biomaRt::useMart(biomart = "ensembl", dataset = biomart_dataset)
> entrez = biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene"), mart = mart)
> entrez$entrezgene = as.character(entrez$entrezgene)
> summarize_cp = function(res, comparison) {
+ summaries = data.frame()
+ for (ont in names(res)) {
+ ontsum = summary(res[[ont]])
+ ontsum$ont = ont
+ summaries = rbind(summaries, ontsum)
+ }
+ summaries$comparison = comparison
+ return(summaries)
+ }
>
> enrich_cp = function(res, comparison) {
+ res = res %>% data.frame() %>% tibble::rownames_to_column() %>% left_join(entrez,
+ by = c(rowname = "ensembl_gene_id")) %>% filter(!is.na(entrezgene))
+ universe = res$entrezgene
+ genes = subset(res, padj < 0.05)$entrezgene
+ mf = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "MF", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ cc = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "CC", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ bp = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "BP", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ kg = enrichKEGG(gene = genes, universe = universe, organism = "mmu", pvalueCutoff = 1,
+ qvalueCutoff = 1, pAdjustMethod = "BH")
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg)
+ all[["summary"]] = summarize_cp(all, comparison)
+ return(all)
+ }
> gsea_cp = function(res, comparison) {
+ res = res %>% left_join(entrez, by = c(rowname = "ensembl_gene_id")) %>%
+ filter(!is.na(entrezgene)) %>% filter(!is.na(log2FoldChange)) %>% filter(!is.na(lfcSE))
+ lfc = data.frame(res)[, "log2FoldChange"]
+ lfcse = data.frame(res)[, "lfcSE"]
+ genes = lfc/lfcse
+ names(genes) = res$entrezgene
+ genes = genes[order(genes, decreasing = TRUE)]
+ cc = gseGO(genes, ont = "CC", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ mf = gseGO(genes, ont = "MF", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ bp = gseGO(genes, ont = "bp", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ genes = data.frame(res)[, "log2FoldChange"]
+ names(genes) = res$entrezgene
+ genes = genes[order(genes, decreasing = TRUE)]
+ genes = genes[!is.na(genes)]
+ kg = gseKEGG(geneList = genes, organism = keggname, nPerm = 500, pvalueCutoff = 1,
+ verbose = TRUE)
+ if (orgdb == "org.Hs.eg.db") {
+ do = summary(gseDO(geneList = genes, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE))
+ do$ont = "DO"
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg, do = do)
+ } else {
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg)
+ }
+ all[["summary"]] = summarize_cp(all, comparison)
+ return(all)
+ }
> library(readr)
> ko_enrich = enrich_cp(ko, "knockout-effect")
> write_csv(subset(ko_enrich$summary, qvalue < 0.05), "knockout-effect-pathways.csv")
> treatwt_enrich = enrich_cp(treatwt, "treatment-effect-on-wt")
> write_csv(subset(treatwt_enrich$summary, qvalue < 0.05), "treatment-effect-on-wt-pathways.csv")
> treatko_enrich = enrich_cp(treatko, "treatment-effect-on-ko")
> write_csv(subset(treatko_enrich$summary, qvalue < 0.05), "treatment-effect-on-ko-pathways.csv")
> treatkospecific_enrich = enrich_cp(treatkospecific, "specific-treatment-effect-on-ko")
> write_csv(subset(treatkospecific_enrich$summary, qvalue < 0.05), "specific-effect-on-ko-pathways.csv")
Treatment effect on wildtype, pathways
> tpm = txi.salmon$abundance %>% as.data.frame() %>% tibble::rownames_to_column() %>%
+ left_join(symbols, by = c(rowname = "ensembl_gene_id"))
> write_csv(tpm, "tpm.csv")
Here are two Il1b plots, one with the huge outlier and one without.
> summarydata$sample = rownames(summarydata)
> il6vals = tpm %>% dplyr::filter(mgi_symbol == "Il1b") %>% tidyr::gather(sample,
+ tpm, -mgi_symbol, -rowname) %>% left_join(summarydata, by = "sample")
> ggplot(il6vals, aes(sample, tpm)) + geom_point() + theme(axis.text.x = element_text(angle = 90,
+ hjust = 1))
> ggplot(subset(il6vals, tpm < 50), aes(sample, tpm)) + geom_point() + theme(axis.text.x = element_text(angle = 90,
+ hjust = 1))